After the first exercises I feel enthusiastic and I am eager to learn more about open data science and R. I hope I will learn the basics of coding and that I can create a platform that I can use in my future research projects. My GitHub repository link: https://github.com/iinatuomainen/IODS-project
#Exercise 2
Describe the work you have done this week and summarize your learning.
Let’s plot the data
# Read the learning2014 data. stringsAsFactors = TRUE keeps `gender` a factor
# on R >= 4.0 as well: the `col = learning2014$gender` argument below relies on
# the factor's integer codes, whereas the strings "F"/"M" are not valid colours.
learning2014 <- read.csv(file = "~/IODS-project/data/learning2014", stringsAsFactors = TRUE)
# Scatterplot matrix of every column except the first, coloured by gender.
pairs(learning2014[-1], col = learning2014$gender)
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of all variables, coloured by gender; the lower "combo" panels use faceted histograms with 20 bins.
ggpairs(learning2014, mapping = aes(col=gender, alpha = 0.3), lower=list(combo=wrap("facethist", bins=20)))
# Simple linear regression of points on attitude.
my_model2 <- lm (points ~ attitude, data = learning2014)
# Arrange the following plots in a 2 x 2 grid.
par(mfrow = c(2,2))
# Diagnostic plots 1, 2 and 5 of the model (residuals vs fitted, normal Q-Q, residuals vs leverage).
plot(my_model2, which = c(1,2,5))
#Read both data sets into R and explore their structures and dimensions.
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too;
# the factor summaries and <fct> columns printed further down rely on that.
MAT <- read.csv(file = "~/IODS-project/data/mat", stringsAsFactors = TRUE)
POR <- read.csv(file = "~/IODS-project/data/por", stringsAsFactors = TRUE)
dim(MAT)
## [1] 395 34
dim(POR)
## [1] 649 34
#Join two data sets using variables “school”, “sex”, “age”, “address”, “famsize”, “Pstatus”, “Medu”, “Fedu”, “Mjob”, “Fjob”, “reason”, “nursery”,“internet”
join_by <- c("school", "sex", "age", "address", "famsize", "Pstatus", "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery","internet")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the rows that match in both data sets on the join_by columns;
# non-key columns that exist in both get the suffix ".MAT" or ".POR".
math_por <- inner_join(MAT, POR, by = join_by, suffix = c(".MAT", ".POR"))
#The if-else structure (copied from the DataCamp exercise)
# Start the combined data frame from the join-key columns only.
alc <- select(math_por, one_of(join_by))
# Names of the columns of MAT that were not used as join keys.
notjoined_columns <- colnames(MAT)[!colnames(MAT) %in% join_by]
notjoined_columns
## [1] "X" "guardian" "traveltime" "studytime" "failures"
## [6] "schoolsup" "famsup" "paid" "activities" "higher"
## [11] "romantic" "famrel" "freetime" "goout" "Dalc"
## [16] "Walc" "health" "absences" "G1" "G2"
## [21] "G3"
# Collapse each pair of duplicated (".MAT" / ".POR") columns into one column of 'alc'
for (column_name in notjoined_columns) {
  # both versions of this variable as they appear in 'math_por'
  both_versions <- select(math_por, starts_with(column_name))
  # the first version, as a plain vector
  leading <- both_versions[[1]]
  # numeric answers: rounded row-wise mean of the versions;
  # anything else: keep the first version unchanged
  alc[column_name] <- if (is.numeric(leading)) {
    round(rowMeans(both_versions))
  } else {
    leading
  }
}
#Average of the answers related to weekday and weekend alcohol consumption to create a new column “alc_use”. Create “high_use”, where TRUE is for students for which “alc_use” is greater than 2, otherwise FALSE
# alc_use: mean of Dalc and Walc; high_use: TRUE when alc_use exceeds 2
alc <- mutate(alc, alc_use = (Dalc + Walc) / 2, high_use = alc_use > 2)
#Glimpse data and save data set to “data folder”
glimpse(alc)
## Observations: 382
## Variables: 36
## $ school <fct> GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, GP, G…
## $ sex <fct> F, F, F, F, F, M, M, F, M, M, F, F, M, M, M, F, F, F,…
## $ age <int> 18, 17, 15, 15, 16, 16, 16, 17, 15, 15, 15, 15, 15, 1…
## $ address <fct> U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,…
## $ famsize <fct> GT3, GT3, LE3, GT3, GT3, LE3, LE3, GT3, LE3, GT3, GT3…
## $ Pstatus <fct> A, T, T, T, T, T, T, A, A, T, T, T, T, T, A, T, T, T,…
## $ Medu <int> 4, 1, 1, 4, 3, 4, 2, 4, 3, 3, 4, 2, 4, 4, 2, 4, 4, 3,…
## $ Fedu <int> 4, 1, 1, 2, 3, 3, 2, 4, 2, 4, 4, 1, 4, 3, 2, 4, 4, 3,…
## $ Mjob <fct> at_home, at_home, at_home, health, other, services, o…
## $ Fjob <fct> teacher, other, other, services, other, other, other,…
## $ reason <fct> course, course, other, home, home, reputation, home, …
## $ nursery <fct> yes, no, yes, yes, yes, yes, yes, yes, yes, yes, yes,…
## $ internet <fct> no, yes, yes, yes, no, yes, yes, no, yes, yes, yes, y…
## $ X <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16…
## $ guardian <fct> mother, father, mother, mother, father, mother, mothe…
## $ traveltime <dbl> 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 3, 1, 2, 1, 1, 1, 3,…
## $ studytime <dbl> 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 3, 1, 3, 2,…
## $ failures <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ schoolsup <fct> yes, no, yes, no, no, no, no, yes, no, no, no, no, no…
## $ famsup <fct> no, yes, no, yes, yes, yes, no, yes, yes, yes, yes, y…
## $ paid <fct> no, no, yes, yes, yes, yes, no, no, yes, yes, yes, no…
## $ activities <fct> no, no, no, yes, no, yes, no, no, no, yes, no, yes, y…
## $ higher <fct> yes, yes, yes, yes, yes, yes, yes, yes, yes, yes, yes…
## $ romantic <fct> no, no, no, yes, no, no, no, no, no, no, no, no, no, …
## $ famrel <dbl> 4, 5, 4, 3, 4, 5, 4, 4, 4, 5, 3, 5, 4, 5, 4, 4, 3, 5,…
## $ freetime <dbl> 3, 3, 3, 2, 3, 4, 4, 1, 2, 5, 3, 2, 3, 4, 5, 4, 2, 3,…
## $ goout <dbl> 4, 3, 2, 2, 2, 2, 4, 4, 2, 1, 3, 2, 3, 3, 2, 4, 3, 2,…
## $ Dalc <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ Walc <dbl> 1, 1, 3, 1, 2, 2, 1, 1, 1, 1, 2, 1, 3, 2, 1, 2, 2, 1,…
## $ health <dbl> 3, 3, 3, 5, 5, 5, 3, 1, 1, 5, 2, 4, 5, 3, 3, 2, 2, 4,…
## $ absences <dbl> 5, 3, 8, 1, 2, 8, 0, 4, 0, 0, 1, 2, 1, 1, 0, 5, 8, 3,…
## $ G1 <dbl> 2, 7, 10, 14, 8, 14, 12, 8, 16, 13, 12, 10, 13, 11, 1…
## $ G2 <dbl> 8, 8, 10, 14, 12, 14, 12, 9, 17, 14, 11, 12, 14, 11, …
## $ G3 <dbl> 8, 8, 11, 14, 12, 14, 12, 10, 18, 14, 12, 12, 13, 12,…
## $ alc_use <dbl> 1.0, 1.0, 2.5, 1.0, 1.5, 1.5, 1.0, 1.0, 1.0, 1.0, 1.5…
## $ high_use <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE…
# row.names = FALSE: otherwise the row numbers are written as an unnamed first
# column, which read.csv() brings back as a spurious "X" variable.
write.csv(alc, file = "~/IODS-project/data/alc", row.names = FALSE)
#Read data and print out the names of the varibles in the data
colnames(alc)
## [1] "school" "sex" "age" "address" "famsize"
## [6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
## [11] "reason" "nursery" "internet" "X" "guardian"
## [16] "traveltime" "studytime" "failures" "schoolsup" "famsup"
## [21] "paid" "activities" "higher" "romantic" "famrel"
## [26] "freetime" "goout" "Dalc" "Walc" "health"
## [31] "absences" "G1" "G2" "G3" "alc_use"
## [36] "high_use"
dim(alc)
## [1] 382 36
#The data include 36 variables for 382 observations (students). They contain information about the students’ school (2 different schools), their parents’ education and work, school success and free time. In addition, the data include information about the students’ alcohol consumption.
#Study the relationships between high/low alcohol consumption and other variables in the data. I chose the variables mother’s education (Medu), extra-curricular activities (activities), wanting to take higher education (higher) and current health status (health). My hypothesis is that higher mother’s education and better current health status are associated with low alcohol consumption. Moreover, having extra-curricular activities (yes) and wanting to take higher education (yes) are related to lower alcohol consumption.
#Explore numerically and graphically the distributions of variables and their relationships with alcohol consumption.
mean(alc$Medu)
## [1] 2.806283
mean(alc$health)
## [1] 3.573298
mean(alc$alc_use)
## [1] 1.888743
summary(alc$higher)
## no yes
## 18 364
summary(alc$activities)
## no yes
## 181 201
summary(alc$Medu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 2.806 4.000 4.000
summary(alc$health)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 3.000 4.000 3.573 5.000 5.000
summary(alc$alc_use)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.500 1.889 2.500 5.000
summary(alc$high_use)
## Mode FALSE TRUE
## logical 268 114
# Count the students in every Medu x high_use combination. mean_edu is the mean of Medu
# inside a group that is already defined by Medu, so it simply repeats the Medu value.
alc %>% group_by(Medu, high_use) %>% summarise(count=n(), mean_edu=mean(Medu))
## # A tibble: 10 x 4
## # Groups: Medu [5]
## Medu high_use count mean_edu
## <int> <lgl> <int> <dbl>
## 1 0 FALSE 1 0
## 2 0 TRUE 2 0
## 3 1 FALSE 33 1
## 4 1 TRUE 18 1
## 5 2 FALSE 80 2
## 6 2 TRUE 18 2
## 7 3 FALSE 59 3
## 8 3 TRUE 36 3
## 9 4 FALSE 95 4
## 10 4 TRUE 40 4
#The median of alcohol use is 1.5, and for 114 of the students the alcohol consumption during one week is above 2 on the scale from 1 to 5. The mean of the students’ mothers’ education is 2.8, which is close to 3 (secondary education). The mean of the students’ current health status is 3.57, which is about average on the scale from 1 to 5. 201 students have extra-curricular activities and 364 students want to take higher education.
#Logistic regression analyses
m1 <- glm(high_use ~ Medu + activities + higher + health, data = alc, family = "binomial")
summary(m1)
##
## Call:
## glm(formula = high_use ~ Medu + activities + higher + health,
## family = "binomial", data = alc)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3009 -0.8594 -0.7891 1.4521 1.7288
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.43862 0.61980 -0.708 0.4791
## Medu 0.06602 0.10623 0.622 0.5343
## activitiesyes -0.22969 0.22812 -1.007 0.3140
## higheryes -0.86198 0.49672 -1.735 0.0827 .
## health 0.09205 0.08229 1.119 0.2633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 465.68 on 381 degrees of freedom
## Residual deviance: 459.87 on 377 degrees of freedom
## AIC: 469.87
##
## Number of Fisher Scoring iterations: 4
# Exponentiate the coefficients of m1 and their confidence limits
OR <- exp(coef(m1))
CI <- exp(confint(m1))
## Waiting for profiling to be done...
# exponentiated coefficients side by side with their confidence intervals
cbind(OR, CI)
## OR 2.5 % 97.5 %
## (Intercept) 0.6449236 0.1884639 2.180536
## Medu 1.0682517 0.8686646 1.318535
## activitiesyes 0.7947811 0.5076034 1.243070
## higheryes 0.4223248 0.1571527 1.133624
## health 1.0964173 0.9346660 1.291396
#None of the selected variables were significantly associated with high alcohol use. #The odds ratios for the variables were: Medu 1.07 (95% CI 0.87-1.32), activities (yes) 0.79 (0.51-1.24), higher (yes) 0.42 (0.16-1.13) and health 1.10 (0.93-1.29). If the associations were statistically significant, a one-unit increase in mother’s education or in health would increase the odds of high alcohol use by about 7 % for Medu and about 10 % for health. If a student wants to take higher education (yes) or has extra-curricular activities (yes), the odds of high alcohol use would be about 58 % lower for higher education and about 21 % lower for activities. #An odds ratio is the ratio of the odds of high alcohol use after and before a one-unit change in the explanatory variable. That means that if health increases by one unit, the odds of being a high alcohol user are multiplied by 1.10. If the predictor variable is binary: if a student replied yes to the higher education question, her or his odds of being a high alcohol user are 0.42 times the odds of a student who replied no. Again, these conclusions cannot be drawn here because the p-values are not statistically significant.
#I am using the variable mother education. #2x2 cross tabulation of predictors versus the actual values
library(ggplot2); library (dplyr)
table(high_use=alc$high_use, Medu=alc$Medu) %>% prop.table() %>% addmargins()
## Medu
## high_use 0 1 2 3 4
## FALSE 0.002617801 0.086387435 0.209424084 0.154450262 0.248691099
## TRUE 0.005235602 0.047120419 0.047120419 0.094240838 0.104712042
## Sum 0.007853403 0.133507853 0.256544503 0.248691099 0.353403141
## Medu
## high_use Sum
## FALSE 0.701570681
## TRUE 0.298429319
## Sum 1.000000000
# fitted probabilities of high_use from model m1
probabilities <- predict(m1, type = "response")
# store the probabilities in 'alc' and classify as high use when the probability exceeds 0.5
alc <- mutate(alc, probability = probabilities, prediction = probability > 0.5)
# confusion table: observed high_use (rows) against predicted high_use (columns)
with(alc, table(high_use = high_use, prediction = prediction))
## prediction
## high_use FALSE TRUE
## FALSE 263 5
## TRUE 111 3
#initialize a plot of ‘high_use’ versus ‘probability’ in ‘alc’
g <- ggplot(alc, aes(x = probability, y = high_use, col=prediction))
#define the geom as points and draw the plot
g + geom_point()
#tabulate the target variable versus the predictions
table(high_use = alc$high_use, prediction = alc$prediction) %>% prop.table() %>% addmargins()
## prediction
## high_use FALSE TRUE Sum
## FALSE 0.688481675 0.013089005 0.701570681
## TRUE 0.290575916 0.007853403 0.298429319
## Sum 0.979057592 0.020942408 1.000000000
#I think the table shows 263 true negatives and 3 true positives. Likewise, there are 111 false negatives and 5 false positives.
#define a loss function (average prediction error)
loss_func <- function(class, prob) {
  # share of observations whose predicted probability lies more than 0.5
  # away from the observed class, i.e. the proportion of wrong predictions
  mean(abs(class - prob) > 0.5)
}
#compute the average number of wrong predictions in the (training) data
loss_func(class = alc$high_use, prob = alc$probability)
## [1] 0.3036649
#Leave-one-out cross-validation: K-fold cross-validation with K equal to the number of rows
library(boot)
# cost = loss_func: the prediction error is the proportion of wrong predictions
cv <- cv.glm(data = alc, cost = loss_func, glmfit = m1, K = nrow(alc))
#average number of wrong predictions in the cross validation
cv$delta[1]
## [1] 0.3115183
#The mean prediction error of my model is about 0.30 on the training data and 0.31 in the cross-validation. A lower number means a better model, therefore my model has worse test set performance than the model it is compared with.
#Load data and explore the structure and the dimensions of the data.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
data("Boston")
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
dim(Boston)
## [1] 506 14
#The Boston data include 506 observations and 14 variables. Most of the variables are numerical, and one (chas, coded 0/1) is categorical.
#Show a graphical overview of the data and show summaries of the variables in the data
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08204 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
library(corrplot)
## corrplot 0.84 loaded
m <- cor(Boston)
corrplot(m, method = "circle")
#One of the strongest positive correlations appears to be between the nitrogen oxides concentration (nox) and the proportion of non-retail business acres per town (indus), whereas one of the strongest negative correlations is between age and the weighted mean of distances to five Boston employment centres (dis).
#Standardize the dataset and print out summaries of the scaled data.
boston_scaled <- scale(Boston)
summary(boston_scaled)
## crim zn indus
## Min. :-0.419367 Min. :-0.48724 Min. :-1.5563
## 1st Qu.:-0.410563 1st Qu.:-0.48724 1st Qu.:-0.8668
## Median :-0.390280 Median :-0.48724 Median :-0.2109
## Mean : 0.000000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.007389 3rd Qu.: 0.04872 3rd Qu.: 1.0150
## Max. : 9.924110 Max. : 3.80047 Max. : 2.4202
## chas nox rm age
## Min. :-0.2723 Min. :-1.4644 Min. :-3.8764 Min. :-2.3331
## 1st Qu.:-0.2723 1st Qu.:-0.9121 1st Qu.:-0.5681 1st Qu.:-0.8366
## Median :-0.2723 Median :-0.1441 Median :-0.1084 Median : 0.3171
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:-0.2723 3rd Qu.: 0.5981 3rd Qu.: 0.4823 3rd Qu.: 0.9059
## Max. : 3.6648 Max. : 2.7296 Max. : 3.5515 Max. : 1.1164
## dis rad tax ptratio
## Min. :-1.2658 Min. :-0.9819 Min. :-1.3127 Min. :-2.7047
## 1st Qu.:-0.8049 1st Qu.:-0.6373 1st Qu.:-0.7668 1st Qu.:-0.4876
## Median :-0.2790 Median :-0.5225 Median :-0.4642 Median : 0.2746
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.6617 3rd Qu.: 1.6596 3rd Qu.: 1.5294 3rd Qu.: 0.8058
## Max. : 3.9566 Max. : 1.6596 Max. : 1.7964 Max. : 1.6372
## black lstat medv
## Min. :-3.9033 Min. :-1.5296 Min. :-1.9063
## 1st Qu.: 0.2049 1st Qu.:-0.7986 1st Qu.:-0.5989
## Median : 0.3808 Median :-0.1811 Median :-0.1449
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4332 3rd Qu.: 0.6024 3rd Qu.: 0.2683
## Max. : 0.4406 Max. : 3.5453 Max. : 2.9865
boston_scaled <- as.data.frame(boston_scaled)
#Means of variables are now 0.00 in every variable.
#Create a categorial variable of the crime rate
summary(boston_scaled$crim)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.419367 -0.410563 -0.390280 0.000000 0.007389 9.924110
# The quantiles of the scaled crime rate (minimum, quartiles, maximum) serve as break points.
bins <- quantile(boston_scaled$crim)
# Cut crim into four labelled classes; include.lowest = TRUE keeps the minimum value in the first class.
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))
#Drop the old crime rate variable from the data set
boston_scaled <- dplyr::select(boston_scaled, -crim)
# Add the new categorical crime variable to the data frame.
boston_scaled <- data.frame(boston_scaled, crime)
#Divide the dataset to train and test sets
n <- nrow(boston_scaled)
# randomly pick 80 % of the row numbers for the training set
ind <- sample(n, size = n*0.8)
train <- boston_scaled[ind, ]
# The test set is only the rows that were NOT drawn for training, so that the
# model is evaluated on observations it has not seen during fitting.
# NOTE: the printed summary(test) and cross tabulation further down were
# produced with all 506 rows as the test set and need to be re-generated.
test <- boston_scaled[-ind, ]
#Fit the linear discriminant analysis on the train set.
# crime is the target class; all other columns of train are the predictors.
lda.fit <- lda(crime ~ ., data=train)
#Draw the LDA plot
# numeric codes of the crime classes, used as plotting colour and symbol
classes <- as.numeric(train$crime)
plot(lda.fit, dimen = 2, col = classes, pch = classes)
# Draw the coefficients of x as labelled arrows from the origin on the current plot.
#   x           fitted model whose coef() is a matrix with named rows
#   myscale     multiplier applied to the arrow coordinates
#   arrow_heads length of the arrow heads
#   color       colour of the arrows and labels
#   tex         text size (cex) of the labels
#   choices     which two coefficient columns give the x and y coordinates
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  loadings <- coef(x)
  tip_x <- myscale * loadings[, choices[1]]
  tip_y <- myscale * loadings[, choices[2]]
  arrows(x0 = 0, y0 = 0, x1 = tip_x, y1 = tip_y, col = color, length = arrow_heads)
  # label every arrow with its variable name, placed above the arrow tip (pos = 3)
  text(myscale * loadings[, choices], labels = row.names(loadings),
       cex = tex, col = color, pos = 3)
}
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)
#Save the crime categories from the test set and then remove the categorical crime variable from the dataset.
crime_cat <- test$crime
test <- dplyr::select(test, -crime)
summary(test)
## zn indus chas nox
## Min. :-0.48724 Min. :-1.5563 Min. :-0.2723 Min. :-1.4644
## 1st Qu.:-0.48724 1st Qu.:-0.8668 1st Qu.:-0.2723 1st Qu.:-0.9121
## Median :-0.48724 Median :-0.2109 Median :-0.2723 Median :-0.1441
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.04872 3rd Qu.: 1.0150 3rd Qu.:-0.2723 3rd Qu.: 0.5981
## Max. : 3.80047 Max. : 2.4202 Max. : 3.6648 Max. : 2.7296
## rm age dis rad
## Min. :-3.8764 Min. :-2.3331 Min. :-1.2658 Min. :-0.9819
## 1st Qu.:-0.5681 1st Qu.:-0.8366 1st Qu.:-0.8049 1st Qu.:-0.6373
## Median :-0.1084 Median : 0.3171 Median :-0.2790 Median :-0.5225
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4823 3rd Qu.: 0.9059 3rd Qu.: 0.6617 3rd Qu.: 1.6596
## Max. : 3.5515 Max. : 1.1164 Max. : 3.9566 Max. : 1.6596
## tax ptratio black lstat
## Min. :-1.3127 Min. :-2.7047 Min. :-3.9033 Min. :-1.5296
## 1st Qu.:-0.7668 1st Qu.:-0.4876 1st Qu.: 0.2049 1st Qu.:-0.7986
## Median :-0.4642 Median : 0.2746 Median : 0.3808 Median :-0.1811
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 1.5294 3rd Qu.: 0.8058 3rd Qu.: 0.4332 3rd Qu.: 0.6024
## Max. : 1.7964 Max. : 1.6372 Max. : 0.4406 Max. : 3.5453
## medv
## Min. :-1.9063
## 1st Qu.:-0.5989
## Median :-0.1449
## Mean : 0.0000
## 3rd Qu.: 0.2683
## Max. : 2.9865
#Predict the classes with the LDA model on the test data. Cross tabulate the results.
lda.pred <- predict(lda.fit, newdata = test)
table(correct = crime_cat, predicted = lda.pred$class)
## predicted
## correct low med_low med_high high
## low 69 52 6 0
## med_low 17 93 16 0
## med_high 1 45 74 6
## high 0 0 1 126
#Reload the Boston dataset and standardize it. Calculate the distances between the observations. Run k-means and investigate the optimal number of clusters. Visualize the clusters.
data('Boston')
Boston2 <- scale(Boston)
dist_eu <- dist(Boston2)
summary(dist_eu)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1343 3.4625 4.8241 4.9111 6.1863 14.3970
# fix the random seed so that the k-means results are reproducible
set.seed(123)
k_max <- 10
# total within-cluster sum of squares for k = 1, ..., k_max clusters;
# vapply() guarantees exactly one numeric value per k, which sapply() does not check
twcss <- vapply(seq_len(k_max), function(k){kmeans(Boston2, k)$tot.withinss}, numeric(1))
library(ggplot2)
# elbow plot: twcss against the number of clusters
qplot(x=1:k_max, y=twcss, geom = 'line')
# final clustering with two centers, shown as a scatterplot matrix coloured by cluster
km <- kmeans(Boston2, centers = 2)
pairs(Boston2, col= km$cluster)
#In the qplot picture the twcss drops dramatically when the number of clusters goes from 1 to 2, therefore 2 is the optimal number of clusters.
#Data source:
# Download the two data sets. FALSE is spelled out because F is an ordinary
# variable that can be reassigned; ".." in the gii file is read as NA.
hd <- read.csv("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/human_development.csv", stringsAsFactors = FALSE)
gii <- read.csv("http://s3.amazonaws.com/assets.datacamp.com/production/course_2218/datasets/gender_inequality.csv", stringsAsFactors = FALSE, na.strings = "..")
# Shorter names for columns 1 and 3-8 of hd (column 2 keeps its name).
names(hd)[c(1, 3:8)] <- c("HDI", "Human.HDI", "Life.exp", "Years.exp",
                          "Years.mean", "GNI", "GNI.calc")
# Shorter names for columns 1 and 3-10 of gii (column 2 keeps its name).
names(gii)[c(1, 3:10)] <- c("GII", "gender.GII", "mater.mor", "adol.birth",
                            "rep.parl", "sec.edu.F", "sec.edu.M", "lab.F", "lab.M")
# Ratios of the .F column to the matching .M column.
gii$edu2.FM <- gii$sec.edu.F/gii$sec.edu.M
gii$lab.FM <- gii$lab.F/gii$lab.M
names(gii)
## [1] "GII" "Country" "gender.GII" "mater.mor" "adol.birth"
## [6] "rep.parl" "sec.edu.F" "sec.edu.M" "lab.F" "lab.M"
## [11] "edu2.FM" "lab.FM"
# Inner join of the two data sets on the shared "Country" column.
# merge() has no `inner_join` argument (it was silently swallowed by `...`);
# the key is given with `by`, and merge() keeps only matching rows by default
# (all = FALSE), which is an inner join.
human <- merge(gii, hd, by = "Country")
dim(human)
## [1] 195 19
write.csv(human, file = "~/IODS-project/data/human")
read.csv(file = "~/IODS-project/data/human")
## X Country GII gender.GII mater.mor
## 1 1 Afghanistan 171 0.693 400
## 2 2 Albania 85 0.217 21
## 3 3 Algeria 83 0.413 89
## 4 4 Andorra 34 NA NA
## 5 5 Angola 149 NA 460
## 6 6 Antigua and Barbuda 58 NA NA
## 7 7 Arab States NA 0.537 155
## 8 8 Argentina 40 0.376 69
## 9 9 Armenia 85 0.318 29
## 10 10 Australia 2 0.110 6
## 11 11 Austria 23 0.053 4
## 12 12 Azerbaijan 78 0.303 26
## 13 13 Bahamas 55 0.298 37
## 14 14 Bahrain 45 0.265 22
## 15 15 Bangladesh 142 0.503 170
## 16 16 Barbados 57 0.357 52
## 17 17 Belarus 50 0.151 1
## 18 18 Belgium 21 0.063 6
## 19 19 Belize 101 0.426 45
## 20 20 Benin 166 0.614 340
## 21 21 Bhutan 132 0.457 120
## 22 22 Bolivia (Plurinational State of) 119 0.444 200
## 23 23 Bosnia and Herzegovina 85 0.201 8
## 24 24 Botswana 106 0.480 170
## 25 25 Brazil 75 0.457 69
## 26 26 Brunei Darussalam 31 NA 27
## 27 27 Bulgaria 59 0.212 5
## 28 28 Burkina Faso 183 0.631 400
## 29 29 Burundi 184 0.492 740
## 30 30 Cabo Verde 122 NA 53
## 31 31 Cambodia 143 0.477 170
## 32 32 Cameroon 153 0.587 590
## 33 33 Canada 9 0.129 11
## 34 34 Central African Republic 187 0.655 880
## 35 35 Chad 185 0.706 980
## 36 36 Chile 42 0.338 22
## 37 37 China 90 0.191 32
## 38 38 Colombia 97 0.429 83
## 39 39 Comoros 159 NA 350
## 40 40 Congo 136 0.593 410
## 41 41 Congo (Democratic Republic of the) 176 0.673 730
## 42 42 Costa Rica 69 0.349 38
## 43 43 Côte d'Ivoire 172 0.679 720
## 44 44 Croatia 47 0.149 13
## 45 45 Cuba 67 0.356 80
## 46 46 Cyprus 32 0.124 10
## 47 47 Czech Republic 28 0.091 5
## 48 48 Denmark 4 0.048 5
## 49 49 Djibouti 168 NA 230
## 50 50 Dominica 94 NA NA
## 51 51 Dominican Republic 101 0.477 100
## 52 52 East Asia and the Pacific NA 0.328 72
## 53 53 Ecuador 88 0.407 87
## 54 54 Egypt 108 0.573 45
## 55 55 El Salvador 116 0.427 69
## 56 56 Equatorial Guinea 138 NA 290
## 57 57 Eritrea 186 NA 380
## 58 58 Estonia 30 0.164 11
## 59 59 Ethiopia 174 0.558 420
## 60 60 Europe and Central Asia NA 0.300 28
## 61 61 Fiji 90 0.418 59
## 62 62 Finland 24 0.075 4
## 63 63 France 22 0.088 12
## 64 64 Gabon 110 0.514 240
## 65 65 Gambia 175 0.622 430
## 66 66 Georgia 76 0.382 41
## 67 67 Germany 6 0.041 7
## 68 68 Ghana 140 0.554 380
## 69 69 Greece 29 0.146 5
## 70 70 Grenada 79 NA 23
## 71 71 Guatemala 128 0.533 140
## 72 72 Guinea 182 NA 650
## 73 73 Guinea-Bissau 178 NA 560
## 74 74 Guyana 124 0.515 250
## 75 75 Haiti 163 0.603 380
## 76 76 Honduras 131 0.480 120
## 77 77 Hong Kong, China (SAR) 12 NA NA
## 78 78 Hungary 44 0.209 14
## 79 79 Iceland 16 0.087 4
## 80 80 India 130 0.563 190
## 81 81 Indonesia 110 0.494 190
## 82 82 Iran (Islamic Republic of) 69 0.515 23
## 83 83 Iraq 121 0.539 67
## 84 84 Ireland 6 0.113 9
## 85 85 Israel 18 0.101 2
## 86 86 Italy 27 0.068 4
## 87 87 Jamaica 99 0.430 80
## 88 88 Japan 20 0.133 6
## 89 89 Jordan 80 0.473 50
## 90 90 Kazakhstan 56 0.267 26
## 91 91 Kenya 145 0.552 400
## 92 92 Kiribati 137 NA 130
## 93 93 Korea (Republic of) 17 0.125 27
## 94 94 Kuwait 48 0.387 14
## 95 95 Kyrgyzstan 120 0.353 75
## 96 96 Lao People's Democratic Republic 141 NA NA
## 97 97 Latin America and the Caribbean NA 0.415 85
## 98 98 Latvia 46 0.167 13
## 99 99 Lebanon 67 0.385 16
## 100 100 Lesotho 161 0.541 490
## 101 101 Liberia 177 0.651 640
## 102 102 Libya 94 0.134 15
## 103 103 Liechtenstein 13 NA NA
## 104 104 Lithuania 37 0.125 11
## 105 105 Luxembourg 19 0.100 11
## 106 106 Madagascar 154 NA 440
## 107 107 Malawi 173 0.611 510
## 108 108 Malaysia 62 0.209 29
## 109 109 Maldives 104 0.243 31
## 110 110 Mali 179 0.677 550
## 111 111 Malta 37 0.227 9
## 112 112 Mauritania 156 0.610 320
## 113 113 Mauritius 63 0.419 73
## 114 114 Mexico 74 0.373 49
## 115 115 Micronesia (Federated States of) 123 NA 96
## 116 116 Moldova (Republic of) 107 0.248 21
## 117 117 Mongolia 90 0.325 68
## 118 118 Montenegro 49 0.171 7
## 119 119 Morocco 126 0.525 120
## 120 120 Mozambique 180 0.591 480
## 121 121 Myanmar 148 0.413 200
## 122 122 Namibia 126 0.401 130
## 123 123 Nepal 145 0.489 190
## 124 124 Netherlands 5 0.062 6
## 125 125 New Zealand 9 0.157 8
## 126 126 Nicaragua 125 0.449 100
## 127 127 Niger 188 0.713 630
## 128 128 Nigeria 152 NA 560
## 129 129 Norway 1 0.067 4
## 130 130 Oman 52 0.275 11
## 131 131 Pakistan 147 0.536 170
## 132 132 Palau 60 NA NA
## 133 133 Palestine, State of 113 NA NA
## 134 134 Panama 60 0.454 85
## 135 135 Papua New Guinea 158 0.611 220
## 136 136 Paraguay 112 0.472 110
## 137 137 Peru 84 0.406 89
## 138 138 Philippines 115 0.420 120
## 139 139 Poland 36 0.138 3
## 140 140 Portugal 43 0.111 8
## 141 141 Qatar 32 0.524 6
## 142 142 Romania 52 0.333 33
## 143 143 Russian Federation 50 0.276 24
## 144 144 Rwanda 163 0.400 320
## 145 145 Saint Kitts and Nevis 77 NA NA
## 146 146 Saint Lucia 89 NA 34
## 147 147 Saint Vincent and the Grenadines 97 NA 45
## 148 148 Samoa 105 0.457 58
## 149 149 Sao Tome and Principe 143 NA 210
## 150 150 Saudi Arabia 39 0.284 16
## 151 151 Senegal 170 0.528 320
## 152 152 Serbia 66 0.176 16
## 153 153 Seychelles 64 NA NA
## 154 154 Sierra Leone 181 0.650 1100
## 155 155 Singapore 11 0.088 6
## 156 156 Slovakia 35 0.164 7
## 157 157 Slovenia 25 0.016 7
## 158 158 Solomon Islands 156 NA 130
## 159 159 South Africa 116 0.407 140
## 160 160 South Asia NA 0.536 183
## 161 161 South Sudan 169 NA 730
## 162 162 Spain 26 0.095 4
## 163 163 Sri Lanka 73 0.370 29
## 164 164 Sub-Saharan Africa NA 0.575 506
## 165 165 Sudan 167 0.591 360
## 166 166 Suriname 103 0.463 130
## 167 167 Swaziland 150 0.557 310
## 168 168 Sweden 14 0.055 4
## 169 169 Switzerland 3 0.028 6
## 170 170 Syrian Arab Republic 134 0.533 49
## 171 171 Tajikistan 129 0.357 44
## 172 172 Tanzania (United Republic of) 151 0.547 410
## 173 173 Thailand 93 0.380 26
## 174 174 The former Yugoslav Republic of Macedonia 81 0.164 7
## 175 175 Timor-Leste 133 NA 270
## 176 176 Togo 162 0.588 450
## 177 177 Tonga 100 0.666 120
## 178 178 Trinidad and Tobago 64 0.371 84
## 179 179 Tunisia 96 0.240 46
## 180 180 Turkey 72 0.359 20
## 181 181 Turkmenistan 109 NA 61
## 182 182 Uganda 163 0.538 360
## 183 183 Ukraine 81 0.286 23
## 184 184 United Arab Emirates 41 0.232 8
## 185 185 United Kingdom 14 0.177 8
## 186 186 United States 8 0.280 28
## 187 187 Uruguay 52 0.313 14
## 188 188 Uzbekistan 114 NA 36
## 189 189 Vanuatu 134 NA 86
## 190 190 Venezuela (Bolivarian Republic of) 71 0.476 110
## 191 191 Viet Nam 116 0.308 49
## 192 192 World NA 0.449 210
## 193 193 Yemen 160 0.744 270
## 194 194 Zambia 139 0.587 280
## 195 195 Zimbabwe 155 0.504 470
## adol.birth rep.parl sec.edu.F sec.edu.M lab.F lab.M edu2.FM
## 1 86.8 27.6 5.9 29.8 15.8 79.5 0.1979866
## 2 15.3 20.7 81.8 87.9 44.9 65.5 0.9306030
## 3 10.0 25.7 26.7 31.0 15.2 72.2 0.8612903
## 4 NA 50.0 49.5 49.3 NA NA 1.0040568
## 5 170.2 36.8 NA NA 63.3 76.9 NA
## 6 49.3 25.7 NA NA NA NA NA
## 7 45.4 14.0 34.7 47.6 23.2 75.3 0.7289916
## 8 54.4 36.8 56.3 57.6 47.5 75.0 0.9774306
## 9 27.1 10.7 94.0 95.0 54.2 72.6 0.9894737
## 10 12.1 30.5 94.3 94.6 58.8 71.8 0.9968288
## 11 4.1 30.3 100.0 100.0 54.6 67.7 1.0000000
## 12 40.0 15.6 93.7 97.4 62.9 69.6 0.9620123
## 13 28.5 16.7 91.2 87.6 69.3 79.3 1.0410959
## 14 13.8 15.0 56.7 51.4 39.2 86.9 1.1031128
## 15 80.6 20.0 34.1 41.3 57.4 84.1 0.8256659
## 16 48.4 19.6 89.5 87.7 65.9 76.6 1.0205245
## 17 20.6 30.1 87.0 92.2 50.1 63.1 0.9436009
## 18 6.7 42.4 77.5 82.9 47.5 59.3 0.9348613
## 19 71.4 13.3 76.4 75.8 49.2 82.3 1.0079156
## 20 90.2 8.4 11.3 27.0 67.6 78.3 0.4185185
## 21 40.9 8.3 34.0 34.5 66.7 77.2 0.9855072
## 22 71.9 51.8 47.6 59.1 64.2 80.9 0.8054146
## 23 15.1 19.3 44.9 69.8 34.1 57.3 0.6432665
## 24 44.2 9.5 73.6 77.9 71.9 81.6 0.9448010
## 25 70.8 9.6 54.6 52.4 59.4 80.8 1.0419847
## 26 23.0 NA 63.9 67.8 52.6 75.3 0.9424779
## 27 35.9 20.4 93.0 95.7 47.9 59.0 0.9717868
## 28 115.4 13.3 0.9 3.2 77.1 90.0 0.2812500
## 29 30.3 34.9 5.3 8.3 83.3 82.0 0.6385542
## 30 70.6 20.8 NA NA 51.5 83.7 NA
## 31 44.3 19.0 9.9 22.9 78.8 86.5 0.4323144
## 32 115.8 27.1 21.3 34.9 63.8 76.8 0.6103152
## 33 14.5 28.2 100.0 100.0 61.6 71.0 1.0000000
## 34 98.3 12.5 10.1 26.7 72.6 85.1 0.3782772
## 35 152.0 14.9 1.7 9.9 64.0 79.2 0.1717172
## 36 55.3 15.8 73.3 76.4 49.2 74.8 0.9594241
## 37 8.6 23.6 58.7 71.9 63.9 78.3 0.8164117
## 38 68.5 20.9 56.9 55.6 55.8 79.7 1.0233813
## 39 51.1 3.0 NA NA 35.2 80.1 NA
## 40 126.7 11.5 39.7 47.0 68.5 73.0 0.8446809
## 41 135.3 8.2 12.8 32.4 70.7 73.2 0.3950617
## 42 60.8 33.3 50.7 50.5 46.6 79.0 1.0039604
## 43 130.3 9.2 14.0 30.1 52.4 81.4 0.4651163
## 44 12.7 25.8 85.0 93.6 44.7 58.4 0.9081197
## 45 43.1 48.9 74.3 78.8 43.4 70.0 0.9428934
## 46 5.5 12.5 76.0 81.7 56.0 71.1 0.9302326
## 47 4.9 18.9 99.9 99.7 51.1 68.3 1.0020060
## 48 5.1 38.0 95.5 96.6 58.7 66.4 0.9886128
## 49 18.6 12.7 NA NA 36.3 67.7 NA
## 50 NA 21.9 29.7 23.2 NA NA 1.2801724
## 51 99.6 19.1 55.6 53.1 51.3 78.6 1.0470810
## 52 21.2 18.7 54.7 66.3 62.6 79.4 0.8250377
## 53 77.0 41.6 40.1 39.4 54.7 82.7 1.0177665
## 54 43.0 2.2 43.9 60.6 23.7 74.8 0.7244224
## 55 76.0 27.4 36.8 43.6 47.8 79.0 0.8440367
## 56 112.6 19.7 NA NA 80.7 92.2 NA
## 57 65.3 22.0 NA NA 80.0 89.8 NA
## 58 16.8 19.8 100.0 100.0 56.2 68.9 1.0000000
## 59 78.4 25.5 7.8 18.2 78.2 89.3 0.4285714
## 60 30.8 19.0 70.8 80.6 45.6 70.0 0.8784119
## 61 42.8 14.0 64.2 64.5 37.5 72.0 0.9953488
## 62 9.2 42.5 100.0 100.0 55.7 64.0 1.0000000
## 63 5.7 25.7 78.0 83.2 50.7 61.6 0.9375000
## 64 103.0 16.2 53.9 36.1 56.2 65.4 1.4930748
## 65 115.8 9.4 17.4 31.5 72.2 82.9 0.5523810
## 66 46.8 11.3 89.7 92.7 56.5 75.1 0.9676375
## 67 3.8 36.9 96.3 97.0 53.6 66.4 0.9927835
## 68 58.4 10.9 45.2 64.7 67.3 71.4 0.6986090
## 69 11.9 21.0 59.5 67.0 44.2 62.5 0.8880597
## 70 35.4 25.0 NA NA NA NA NA
## 71 97.2 13.3 21.9 23.2 49.3 88.2 0.9439655
## 72 131.0 21.9 NA NA 65.6 78.3 NA
## 73 99.3 13.7 NA NA 68.2 78.5 NA
## 74 88.5 31.3 60.3 47.8 42.6 80.5 1.2615063
## 75 42.0 3.5 22.4 35.2 60.9 71.0 0.6363636
## 76 84.0 25.8 28.0 25.8 42.8 82.9 1.0852713
## 77 3.3 NA 72.2 79.2 51.3 67.8 0.9116162
## 78 12.1 10.1 97.9 98.7 44.8 60.0 0.9918946
## 79 11.5 41.3 91.0 91.6 70.5 77.4 0.9934498
## 80 32.8 12.2 27.0 56.6 27.0 79.9 0.4770318
## 81 48.3 17.1 39.9 49.2 51.4 84.2 0.8109756
## 82 31.6 3.1 62.2 67.6 16.6 73.6 0.9201183
## 83 68.7 26.5 27.8 50.2 14.9 69.8 0.5537849
## 84 8.2 19.9 80.5 78.6 53.1 68.1 1.0241730
## 85 7.8 22.5 84.4 87.3 57.9 69.1 0.9667812
## 86 4.0 30.1 71.2 80.5 39.6 59.5 0.8844720
## 87 70.1 16.7 74.0 70.2 56.1 70.9 1.0541311
## 88 5.4 11.6 87.0 85.8 48.8 70.4 1.0139860
## 89 26.5 11.6 69.5 78.5 15.6 66.6 0.8853503
## 90 29.9 20.1 95.3 98.8 67.7 77.9 0.9645749
## 91 93.6 20.8 25.3 31.4 62.2 72.4 0.8057325
## 92 16.6 8.7 NA NA NA NA NA
## 93 2.2 16.3 77.0 89.1 50.1 72.1 0.8641975
## 94 14.5 1.5 55.6 56.3 43.6 83.1 0.9875666
## 95 29.3 23.3 94.5 96.8 56.0 79.5 0.9762397
## 96 65.0 25.0 22.9 37.0 76.3 79.1 0.6189189
## 97 68.3 27.0 54.3 55.2 53.7 79.8 0.9836957
## 98 13.5 18.0 98.9 99.0 54.9 67.6 0.9989899
## 99 12.0 3.1 53.0 55.4 23.3 70.9 0.9566787
## 100 89.4 26.8 21.9 19.0 59.0 73.5 1.1526316
## 101 117.4 10.7 15.4 39.3 58.2 64.8 0.3918575
## 102 2.5 16.0 55.5 41.9 30.0 76.4 1.3245823
## 103 NA 20.0 NA NA NA NA NA
## 104 10.6 23.4 89.1 94.3 55.8 67.3 0.9448568
## 105 8.3 28.3 100.0 100.0 50.7 64.6 1.0000000
## 106 122.8 20.5 NA NA 86.6 90.5 NA
## 107 144.8 16.7 11.1 21.6 84.6 81.5 0.5138889
## 108 5.7 14.2 65.1 71.3 44.4 75.5 0.9130435
## 109 4.2 5.9 27.3 32.7 56.2 77.5 0.8348624
## 110 175.6 9.5 7.7 15.1 50.8 81.4 0.5099338
## 111 18.2 13.0 68.6 78.2 37.9 66.3 0.8772379
## 112 73.3 22.2 8.3 20.9 28.7 79.1 0.3971292
## 113 30.9 11.6 49.4 58.0 43.6 74.2 0.8517241
## 114 63.4 37.1 55.7 60.6 45.1 79.9 0.9191419
## 115 18.6 0.0 NA NA NA NA NA
## 116 29.3 20.8 93.6 96.6 37.6 44.2 0.9689441
## 117 18.7 14.9 85.3 84.1 56.6 69.3 1.0142687
## 118 15.2 17.3 84.2 94.7 43.0 57.3 0.8891235
## 119 35.8 11.0 20.7 30.2 26.5 75.8 0.6854305
## 120 137.8 39.6 1.4 6.2 85.5 82.8 0.2258065
## 121 12.1 4.7 22.9 15.3 75.2 82.3 1.4967320
## 122 54.9 37.7 33.3 34.4 54.7 63.7 0.9680233
## 123 73.7 29.5 17.7 38.2 79.9 87.1 0.4633508
## 124 6.2 36.9 87.7 90.5 58.5 70.6 0.9690608
## 125 25.3 31.4 95.0 95.3 62.0 73.8 0.9968520
## 126 100.8 39.1 39.4 38.3 47.4 80.3 1.0287206
## 127 204.8 13.3 2.4 7.8 40.0 89.7 0.3076923
## 128 119.6 6.6 NA NA 48.2 63.7 NA
## 129 7.8 39.6 97.4 96.7 61.2 68.7 1.0072389
## 130 10.6 9.6 47.2 57.1 29.0 82.6 0.8266200
## 131 27.3 19.7 19.3 46.1 24.6 82.9 0.4186551
## 132 NA 10.3 NA NA NA NA NA
## 133 45.8 NA 53.9 59.4 15.4 66.4 0.9074074
## 134 78.5 19.3 54.0 49.9 49.0 81.8 1.0821643
## 135 62.1 2.7 7.6 14.5 70.5 74.0 0.5241379
## 136 67.0 16.8 36.8 43.0 55.7 84.8 0.8558140
## 137 50.7 22.3 56.3 66.1 68.2 84.4 0.8517398
## 138 46.8 27.1 65.9 63.7 51.1 79.7 1.0345369
## 139 12.2 22.1 79.4 85.5 48.9 64.9 0.9286550
## 140 12.6 31.3 47.7 48.2 54.9 66.2 0.9896266
## 141 9.5 0.0 66.7 59.0 50.8 95.5 1.1305085
## 142 31.0 12.0 86.1 92.0 48.7 64.9 0.9358696
## 143 25.7 14.5 89.6 92.5 57.1 71.7 0.9686486
## 144 33.6 57.5 8.0 8.8 86.4 85.3 0.9090909
## 145 NA 6.7 NA NA NA NA NA
## 146 56.3 20.7 NA NA 62.7 76.2 NA
## 147 54.5 13.0 NA NA 55.7 78.0 NA
## 148 28.3 6.1 64.3 60.0 23.5 58.4 1.0716667
## 149 65.1 18.2 NA NA 45.3 77.8 NA
## 150 10.2 19.9 60.5 70.3 20.2 78.3 0.8605974
## 151 94.4 42.7 7.2 15.4 66.0 88.0 0.4675325
## 152 16.9 34.0 58.4 73.6 44.5 60.9 0.7934783
## 153 56.3 43.8 66.9 66.6 NA NA 1.0045045
## 154 100.7 12.4 10.0 21.7 65.7 69.0 0.4608295
## 155 6.0 25.3 74.1 81.0 58.8 77.2 0.9148148
## 156 15.9 18.7 99.1 99.5 51.1 68.6 0.9959799
## 157 0.6 27.7 95.8 98.0 52.3 63.2 0.9775510
## 158 64.9 2.0 NA NA 53.4 79.0 NA
## 159 50.9 40.7 72.7 75.9 44.5 60.5 0.9578393
## 160 38.7 17.5 29.1 54.6 29.8 80.3 0.5329670
## 161 75.3 24.3 NA NA NA NA NA
## 162 10.6 38.0 66.8 73.1 52.5 65.8 0.9138167
## 163 16.9 5.8 72.7 76.4 35.1 76.3 0.9515707
## 164 109.7 22.5 22.1 31.5 65.4 76.6 0.7015873
## 165 84.0 23.8 12.1 18.2 31.3 76.0 0.6648352
## 166 35.2 11.8 44.6 47.1 40.5 68.8 0.9469214
## 167 72.0 14.7 21.9 26.0 43.9 71.6 0.8423077
## 168 6.5 43.6 86.5 87.3 60.3 67.9 0.9908362
## 169 1.9 28.5 95.0 96.6 61.8 74.9 0.9834369
## 170 41.6 12.4 29.5 40.5 13.5 72.7 0.7283951
## 171 42.8 15.2 95.1 91.2 58.9 77.1 1.0427632
## 172 122.7 36.0 5.6 9.5 88.1 90.2 0.5894737
## 173 41.0 6.1 35.7 40.8 64.3 80.7 0.8750000
## 174 18.3 33.3 40.2 55.6 43.1 67.5 0.7230216
## 175 52.2 38.5 NA NA 24.6 50.8 NA
## 176 91.5 17.6 16.1 40.3 80.6 81.3 0.3995037
## 177 18.1 0.0 87.5 88.3 53.5 74.6 0.9909400
## 178 34.8 24.7 59.7 60.9 53.0 75.5 0.9802956
## 179 4.6 31.3 32.8 46.1 25.1 70.9 0.7114967
## 180 30.9 14.4 39.0 60.0 29.4 70.8 0.6500000
## 181 18.0 25.8 NA NA 46.9 76.9 NA
## 182 126.6 35.0 22.9 33.5 75.8 79.2 0.6835821
## 183 25.7 11.8 91.7 95.9 53.2 66.9 0.9562044
## 184 27.6 17.5 73.1 61.2 46.5 92.0 1.1944444
## 185 25.8 23.5 99.8 99.9 55.7 68.7 0.9989990
## 186 31.0 19.4 95.1 94.8 56.3 68.9 1.0031646
## 187 58.3 11.5 54.4 50.3 55.6 76.8 1.0815109
## 188 38.8 16.4 NA NA 48.1 75.6 NA
## 189 44.8 0.0 NA NA 61.5 80.0 NA
## 190 83.2 17.0 56.6 50.8 51.1 79.2 1.1141732
## 191 29.0 24.3 59.4 71.2 73.0 82.2 0.8342697
## 192 47.4 21.8 54.5 65.4 50.3 76.7 0.8333333
## 193 47.0 0.7 8.6 26.7 25.4 72.2 0.3220974
## 194 125.4 12.7 25.8 44.0 73.1 85.6 0.5863636
## 195 60.3 35.1 48.7 62.0 83.2 89.7 0.7854839
## lab.FM HDI Human.HDI Life.exp Years.exp Years.mean GNI GNI.calc
## 1 0.1987421 171 0.465 60.4 9.3 3.2 1,885 -7
## 2 0.6854962 85 0.733 77.8 11.8 9.3 9,943 14
## 3 0.2105263 83 0.736 74.8 14.0 7.6 13,054 -1
## 4 NA 34 0.845 81.3 13.5 9.6 43,978 -18
## 5 0.8231469 149 0.532 52.3 11.4 4.7 6,822 -30
## 6 NA 58 0.783 76.1 14.0 9.2 20,070 -1
## 7 0.3081009 NA 0.686 70.6 12.0 6.4 15,722 NA
## 8 0.6333333 40 0.836 76.3 17.9 9.8 22,050 11
## 9 0.7465565 85 0.733 74.7 12.3 10.9 8,124 22
## 10 0.8189415 2 0.935 82.4 20.2 13.0 42,261 17
## 11 0.8064993 23 0.885 81.4 15.7 10.8 43,869 -5
## 12 0.9037356 78 0.751 70.8 11.9 11.2 16,428 -11
## 13 0.8738966 55 0.790 75.4 12.6 10.9 21,336 -3
## 14 0.4510932 45 0.824 76.6 14.4 9.4 38,599 -20
## 15 0.6825208 142 0.570 71.6 10.0 5.1 3,191 5
## 16 0.8603133 57 0.785 75.6 15.4 10.5 12,488 27
## 17 0.7939778 50 0.798 71.3 15.7 12.0 16,676 14
## 18 0.8010118 21 0.890 80.8 16.3 11.3 41,187 0
## 19 0.5978129 101 0.715 70.0 13.6 10.5 7,614 9
## 20 0.8633461 166 0.480 59.6 11.1 3.3 1,767 0
## 21 0.8639896 132 0.605 69.5 12.6 3.0 7,176 -17
## 22 0.7935723 119 0.662 68.3 13.2 8.2 5,760 4
## 23 0.5951134 85 0.733 76.5 13.6 8.3 9,638 19
## 24 0.8811275 106 0.698 64.5 12.5 8.9 16,646 -41
## 25 0.7351485 75 0.755 74.5 15.2 7.7 15,175 -1
## 26 0.6985392 31 0.856 78.8 14.5 8.8 72,570 -26
## 27 0.8118644 59 0.782 74.2 14.4 10.6 15,596 13
## 28 0.8566667 183 0.402 58.7 7.8 1.4 1,591 -13
## 29 1.0158537 184 0.400 56.7 10.1 2.7 758 1
## 30 0.6152927 122 0.646 73.3 13.5 4.7 6,094 -1
## 31 0.9109827 143 0.555 68.4 10.9 4.4 2,949 7
## 32 0.8307292 153 0.512 55.5 10.4 6.0 2,803 -1
## 33 0.8676056 9 0.913 82.0 15.9 13.0 42,155 11
## 34 0.8531140 187 0.350 50.7 7.2 4.2 581 1
## 35 0.8080808 185 0.392 51.6 7.4 1.9 2,085 -22
## 36 0.6577540 42 0.832 81.7 15.2 9.8 21,290 11
## 37 0.8160920 90 0.727 75.8 13.1 7.5 12,547 -7
## 38 0.7001255 97 0.720 74.0 13.5 7.3 12,040 -9
## 39 0.4394507 159 0.503 63.3 11.5 4.6 1,456 16
## 40 0.9383562 136 0.591 62.3 11.1 6.1 6,012 -14
## 41 0.9658470 176 0.433 58.7 9.8 6.0 680 11
## 42 0.5898734 69 0.766 79.4 13.9 8.4 13,413 10
## 43 0.6437346 172 0.462 51.5 8.9 4.3 3,171 -24
## 44 0.7654110 47 0.818 77.3 14.8 11.0 19,409 11
## 45 0.6200000 67 0.769 79.4 13.8 11.5 7,301 47
## 46 0.7876231 32 0.850 80.2 14.0 11.6 28,633 3
## 47 0.7481698 28 0.870 78.6 16.4 12.3 26,660 10
## 48 0.8840361 4 0.923 80.2 18.7 12.7 44,025 11
## 49 0.5361891 168 0.470 62.0 6.4 3.8 3,276 -22
## 50 NA 94 0.724 77.8 12.7 7.9 9,994 4
## 51 0.6526718 101 0.715 73.5 13.1 7.6 11,883 -12
## 52 0.7884131 NA 0.710 74.0 12.7 7.5 11,449 NA
## 53 0.6614268 88 0.732 75.9 14.2 7.6 10,605 7
## 54 0.3168449 108 0.690 71.1 13.5 6.6 10,512 -12
## 55 0.6050633 116 0.666 73.0 12.3 6.5 7,349 -3
## 56 0.8752711 138 0.587 57.6 9.0 5.5 21,056 -84
## 57 0.8908686 186 0.391 63.7 4.1 3.9 1,130 -6
## 58 0.8156749 30 0.861 76.8 16.5 12.5 25,214 12
## 59 0.8756999 174 0.442 64.1 8.5 2.4 1,428 2
## 60 0.6514286 NA 0.748 72.3 13.6 10.0 12,791 NA
## 61 0.5208333 90 0.727 70.0 15.7 9.9 7,493 21
## 62 0.8703125 24 0.883 80.8 17.1 10.3 38,695 0
## 63 0.8230519 22 0.888 82.2 16.0 11.1 38,056 4
## 64 0.8593272 110 0.684 64.4 12.5 7.8 16,367 -42
## 65 0.8709288 175 0.441 60.2 8.8 2.8 1,507 -2
## 66 0.7523302 76 0.754 74.9 13.8 12.1 7,164 40
## 67 0.8072289 6 0.916 80.9 16.5 13.1 43,919 11
## 68 0.9425770 140 0.579 61.4 11.5 7.0 3,852 -1
## 69 0.7072000 29 0.865 80.9 17.6 10.3 24,524 14
## 70 NA 79 0.750 73.4 15.8 8.6 10,939 14
## 71 0.5589569 128 0.627 71.8 10.7 5.6 6,929 -11
## 72 0.8378033 182 0.411 58.8 8.7 2.4 1,096 0
## 73 0.8687898 178 0.420 55.2 9.0 2.8 1,362 -1
## 74 0.5291925 124 0.636 66.4 10.3 8.5 6,522 -4
## 75 0.8577465 163 0.483 62.8 8.7 4.9 1,669 4
## 76 0.5162847 131 0.606 73.1 11.1 5.5 3,938 7
## 77 0.7566372 12 0.910 84.0 15.6 11.2 53,959 -2
## 78 0.7466667 44 0.828 75.2 15.4 11.6 22,916 3
## 79 0.9108527 16 0.899 82.6 19.0 10.6 35,182 12
## 80 0.3379224 130 0.609 68.0 11.7 5.4 5,497 -4
## 81 0.6104513 110 0.684 68.9 13.0 7.6 9,788 -9
## 82 0.2255435 69 0.766 75.4 15.1 8.2 15,440 4
## 83 0.2134670 121 0.654 69.4 10.1 6.4 14,003 -44
## 84 0.7797357 6 0.916 80.9 18.6 12.2 39,568 16
## 85 0.8379161 18 0.894 82.4 16.0 12.5 30,676 16
## 86 0.6655462 27 0.873 83.1 16.0 10.1 33,030 4
## 87 0.7912553 99 0.719 75.7 12.4 9.7 7,415 13
## 88 0.6931818 20 0.891 83.5 15.3 11.5 36,927 7
## 89 0.2342342 80 0.748 74.0 13.5 9.9 11,365 11
## 90 0.8690629 56 0.788 69.4 15.0 11.4 20,867 -1
## 91 0.8591160 145 0.548 61.6 11.0 6.3 2,762 9
## 92 NA 137 0.590 66.0 12.3 7.8 2,434 21
## 93 0.6948682 17 0.898 81.9 16.9 11.9 33,890 13
## 94 0.5246691 48 0.816 74.4 14.7 7.2 83,961 -46
## 95 0.7044025 120 0.655 70.6 12.5 10.6 3,044 29
## 96 0.9646018 141 0.575 66.2 10.6 5.0 4,680 -6
## 97 0.6729323 NA 0.748 75.0 14.0 8.2 14,242 NA
## 98 0.8121302 46 0.819 74.2 15.2 11.5 22,281 4
## 99 0.3286319 67 0.769 79.3 13.8 7.9 16,509 -1
## 100 0.8027211 161 0.497 49.8 11.1 5.9 3,306 -16
## 101 0.8981481 177 0.430 60.9 9.5 4.1 805 7
## 102 0.3926702 94 0.724 71.6 14.0 7.3 14,911 -19
## 103 NA 13 0.908 80.0 15.0 11.8 79,851 -10
## 104 0.8291233 37 0.839 73.3 16.4 12.4 24,500 7
## 105 0.7848297 19 0.892 81.7 13.9 11.7 58,711 -11
## 106 0.9569061 154 0.510 65.1 10.3 6.0 1,328 24
## 107 1.0380368 173 0.445 62.8 10.8 4.3 747 13
## 108 0.5880795 62 0.779 74.7 12.7 10.0 22,762 -14
## 109 0.7251613 104 0.706 76.8 13.0 5.8 12,328 -19
## 110 0.6240786 179 0.419 58.0 8.4 2.0 1,583 -8
## 111 0.5716440 37 0.839 80.6 14.4 10.3 27,930 -1
## 112 0.3628319 156 0.506 63.1 8.5 3.8 3,560 -14
## 113 0.5876011 63 0.777 74.4 15.6 8.5 17,470 0
## 114 0.5644556 74 0.756 76.8 13.1 8.5 16,056 -4
## 115 NA 123 0.640 69.1 11.7 9.7 3,432 21
## 116 0.8506787 107 0.693 71.6 11.9 11.2 5,223 23
## 117 0.8167388 90 0.727 69.4 14.6 9.3 10,729 4
## 118 0.7504363 49 0.802 76.2 15.2 11.2 14,558 27
## 119 0.3496042 126 0.628 74.0 11.6 4.4 6,850 -8
## 120 1.0326087 180 0.416 55.1 9.3 3.2 1,123 1
## 121 0.9137303 148 0.536 65.9 8.6 4.1 4,608 -12
## 122 0.8587127 126 0.628 64.8 11.3 6.2 9,418 -21
## 123 0.9173364 145 0.548 69.6 12.4 3.3 2,311 16
## 124 0.8286119 5 0.922 81.6 17.9 11.9 45,435 9
## 125 0.8401084 9 0.913 81.8 19.2 12.5 32,689 23
## 126 0.5902864 125 0.631 74.9 11.5 6.0 4,457 12
## 127 0.4459309 188 0.348 61.4 5.4 1.5 908 -5
## 128 0.7566719 152 0.514 52.8 9.0 5.9 5,341 -24
## 129 0.8908297 1 0.944 81.6 17.5 12.6 64,992 5
## 130 0.3510896 52 0.793 76.8 13.6 8.0 34,858 -23
## 131 0.2967431 147 0.538 66.2 7.8 4.7 4,866 -14
## 132 NA 60 0.780 72.7 13.7 12.3 13,496 18
## 133 0.2319277 113 0.677 72.9 13.0 8.9 4,699 21
## 134 0.5990220 60 0.780 77.6 13.3 9.3 18,192 1
## 135 0.9527027 158 0.505 62.6 9.9 4.0 2,463 -1
## 136 0.6568396 112 0.679 72.9 11.9 7.7 7,643 -3
## 137 0.8080569 84 0.734 74.6 13.1 9.0 11,015 8
## 138 0.6411543 115 0.668 68.2 11.3 8.9 7,915 -7
## 139 0.7534669 36 0.843 77.4 15.5 11.8 23,177 10
## 140 0.8293051 43 0.830 80.9 16.3 8.2 25,757 -2
## 141 0.5319372 32 0.850 78.2 13.8 9.1 123,124 -31
## 142 0.7503852 52 0.793 74.7 14.2 10.8 18,108 10
## 143 0.7963738 50 0.798 70.1 14.7 12.0 22,352 -1
## 144 1.0128957 163 0.483 64.2 10.3 3.7 1,458 11
## 145 NA 77 0.752 73.8 12.9 8.4 20,805 -21
## 146 0.8228346 89 0.729 75.1 12.6 9.3 9,765 14
## 147 0.7141026 97 0.720 72.9 13.4 8.6 9,937 3
## 148 0.4023973 105 0.702 73.4 12.9 10.3 5,327 24
## 149 0.5822622 143 0.555 66.5 11.3 4.7 2,918 8
## 150 0.2579821 39 0.837 74.3 16.3 8.7 52,821 -27
## 151 0.7500000 170 0.466 66.5 7.9 2.5 2,188 -8
## 152 0.7307061 66 0.771 74.9 14.4 10.5 12,190 20
## 153 NA 64 0.772 73.1 13.4 9.4 23,300 -19
## 154 0.9521739 181 0.413 50.9 8.6 3.1 1,780 -16
## 155 0.7616580 11 0.912 83.0 15.4 10.6 76,628 -7
## 156 0.7448980 35 0.844 76.3 15.1 12.2 25,845 5
## 157 0.8275316 25 0.880 80.4 16.8 11.9 27,852 12
## 158 0.6759494 156 0.506 67.9 9.2 5.0 1,540 16
## 159 0.7355372 116 0.666 57.4 13.6 9.9 12,122 -29
## 160 0.3711083 NA 0.607 68.4 11.2 5.5 5,605 NA
## 161 NA 169 0.467 55.7 7.6 5.4 2,332 -9
## 162 0.7978723 26 0.876 82.6 17.3 9.6 32,045 7
## 163 0.4600262 73 0.757 74.9 13.7 10.8 9,779 29
## 164 0.8537859 NA 0.518 58.5 9.6 5.2 3,363 NA
## 165 0.4118421 167 0.479 63.5 7.0 3.1 3,809 -27
## 166 0.5886628 103 0.714 71.1 12.7 7.7 15,617 -32
## 167 0.6131285 150 0.531 49.0 11.3 7.1 5,542 -25
## 168 0.8880707 14 0.907 82.2 15.8 12.1 45,636 -1
## 169 0.8251001 3 0.930 83.0 15.8 12.8 56,431 6
## 170 0.1856946 134 0.594 69.6 12.3 6.3 2,728 21
## 171 0.7639429 129 0.624 69.4 11.2 10.4 2,517 27
## 172 0.9767184 151 0.521 65.0 9.2 5.1 2,411 8
## 173 0.7967782 93 0.726 74.4 13.5 7.3 13,323 -13
## 174 0.6385185 81 0.747 75.4 13.4 9.3 11,780 9
## 175 0.4842520 133 0.595 68.2 11.7 4.4 5,363 -6
## 176 0.9913899 162 0.484 59.7 12.2 4.5 1,228 17
## 177 0.7171582 100 0.717 72.8 14.7 10.7 5,069 32
## 178 0.7019868 64 0.772 70.4 12.3 10.9 26,090 -25
## 179 0.3540197 96 0.721 74.8 14.6 6.8 10,404 1
## 180 0.4152542 72 0.761 75.3 14.5 7.6 18,677 -12
## 181 0.6098830 109 0.688 65.6 10.8 9.9 13,066 -28
## 182 0.9570707 163 0.483 58.5 9.8 5.4 1,613 6
## 183 0.7952167 81 0.747 71.0 15.1 11.3 8,178 25
## 184 0.5054348 41 0.835 77.0 13.3 9.5 60,868 -34
## 185 0.8107715 14 0.907 80.7 16.2 13.1 39,267 9
## 186 0.8171263 8 0.915 79.1 16.5 12.9 52,947 3
## 187 0.7239583 52 0.793 77.2 15.5 8.5 19,283 7
## 188 0.6362434 114 0.675 68.4 11.5 10.9 5,567 10
## 189 0.7687500 134 0.594 71.9 10.6 6.8 2,803 19
## 190 0.6452020 71 0.762 74.2 14.2 8.9 16,159 -2
## 191 0.8880779 116 0.666 75.8 11.9 7.5 5,092 15
## 192 0.6558018 NA 0.711 71.5 12.2 7.9 14,301 NA
## 193 0.3518006 160 0.498 63.8 9.2 2.6 3,519 -17
## 194 0.8539720 139 0.586 60.1 13.5 6.6 3,734 2
## 195 0.9275362 155 0.509 57.5 10.9 7.3 1,615 13
#The data include 195 observations and 19 variables. There are two character variables, four integer variables, and the others are numeric.
#Convert the Gross National Income (GNI) variable from character to numeric:
library(stringr)
library(dplyr)
#GNI is stored as text because its values contain thousands separators
str(human$GNI)
## chr [1:195] "1,885" "9,943" "13,054" "43,978" "6,822" "20,070" ...
#Remove every comma, not only the first one: str_replace() would turn
#"1,234,567" into "1234,567", which as.numeric() converts to NA.
#The column is referenced directly inside mutate() and the argument name
#"replacement" is spelled out instead of relying on partial matching.
human <- mutate(
  human,
  GNI = as.numeric(str_replace_all(GNI, pattern = ",", replacement = ""))
)
str(human$GNI)
## num [1:195] 1885 9943 13054 43978 6822 ...
#Exclude unneeded variables
#List the available columns before choosing which ones to keep
names(human)
## [1] "Country" "GII" "gender.GII" "mater.mor" "adol.birth"
## [6] "rep.parl" "sec.edu.F" "sec.edu.M" "lab.F" "lab.M"
## [11] "edu2.FM" "lab.FM" "HDI" "Human.HDI" "Life.exp"
## [16] "Years.exp" "Years.mean" "GNI" "GNI.calc"
#Columns needed for the analysis; every other column is dropped
keep <- c(
  "Country", "edu2.FM", "lab.FM", "Life.exp", "Years.exp",
  "GNI", "mater.mor", "adol.birth", "rep.parl"
)
human <- human %>% dplyr::select(one_of(keep))
#Remove all rows with missing values
#Start by printing, row by row, whether the row is complete (TRUE) or not (FALSE)
complete.cases(human)
## [1] TRUE TRUE TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [34] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
## [56] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [100] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [111] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## [133] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## [155] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [177] TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [188] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
#Show the variables (first column left out) next to the completeness flag
data.frame(human[-1], comp = complete.cases(human))
## edu2.FM lab.FM Life.exp Years.exp GNI mater.mor adol.birth
## 1 0.1979866 0.1987421 60.4 9.3 1885 400 86.8
## 2 0.9306030 0.6854962 77.8 11.8 9943 21 15.3
## 3 0.8612903 0.2105263 74.8 14.0 13054 89 10.0
## 4 1.0040568 NA 81.3 13.5 43978 NA NA
## 5 NA 0.8231469 52.3 11.4 6822 460 170.2
## 6 NA NA 76.1 14.0 20070 NA 49.3
## 7 0.7289916 0.3081009 70.6 12.0 15722 155 45.4
## 8 0.9774306 0.6333333 76.3 17.9 22050 69 54.4
## 9 0.9894737 0.7465565 74.7 12.3 8124 29 27.1
## 10 0.9968288 0.8189415 82.4 20.2 42261 6 12.1
## 11 1.0000000 0.8064993 81.4 15.7 43869 4 4.1
## 12 0.9620123 0.9037356 70.8 11.9 16428 26 40.0
## 13 1.0410959 0.8738966 75.4 12.6 21336 37 28.5
## 14 1.1031128 0.4510932 76.6 14.4 38599 22 13.8
## 15 0.8256659 0.6825208 71.6 10.0 3191 170 80.6
## 16 1.0205245 0.8603133 75.6 15.4 12488 52 48.4
## 17 0.9436009 0.7939778 71.3 15.7 16676 1 20.6
## 18 0.9348613 0.8010118 80.8 16.3 41187 6 6.7
## 19 1.0079156 0.5978129 70.0 13.6 7614 45 71.4
## 20 0.4185185 0.8633461 59.6 11.1 1767 340 90.2
## 21 0.9855072 0.8639896 69.5 12.6 7176 120 40.9
## 22 0.8054146 0.7935723 68.3 13.2 5760 200 71.9
## 23 0.6432665 0.5951134 76.5 13.6 9638 8 15.1
## 24 0.9448010 0.8811275 64.5 12.5 16646 170 44.2
## 25 1.0419847 0.7351485 74.5 15.2 15175 69 70.8
## 26 0.9424779 0.6985392 78.8 14.5 72570 27 23.0
## 27 0.9717868 0.8118644 74.2 14.4 15596 5 35.9
## 28 0.2812500 0.8566667 58.7 7.8 1591 400 115.4
## 29 0.6385542 1.0158537 56.7 10.1 758 740 30.3
## 30 NA 0.6152927 73.3 13.5 6094 53 70.6
## 31 0.4323144 0.9109827 68.4 10.9 2949 170 44.3
## 32 0.6103152 0.8307292 55.5 10.4 2803 590 115.8
## 33 1.0000000 0.8676056 82.0 15.9 42155 11 14.5
## 34 0.3782772 0.8531140 50.7 7.2 581 880 98.3
## 35 0.1717172 0.8080808 51.6 7.4 2085 980 152.0
## 36 0.9594241 0.6577540 81.7 15.2 21290 22 55.3
## 37 0.8164117 0.8160920 75.8 13.1 12547 32 8.6
## 38 1.0233813 0.7001255 74.0 13.5 12040 83 68.5
## 39 NA 0.4394507 63.3 11.5 1456 350 51.1
## 40 0.8446809 0.9383562 62.3 11.1 6012 410 126.7
## 41 0.3950617 0.9658470 58.7 9.8 680 730 135.3
## 42 1.0039604 0.5898734 79.4 13.9 13413 38 60.8
## 43 0.4651163 0.6437346 51.5 8.9 3171 720 130.3
## 44 0.9081197 0.7654110 77.3 14.8 19409 13 12.7
## 45 0.9428934 0.6200000 79.4 13.8 7301 80 43.1
## 46 0.9302326 0.7876231 80.2 14.0 28633 10 5.5
## 47 1.0020060 0.7481698 78.6 16.4 26660 5 4.9
## 48 0.9886128 0.8840361 80.2 18.7 44025 5 5.1
## 49 NA 0.5361891 62.0 6.4 3276 230 18.6
## 50 1.2801724 NA 77.8 12.7 9994 NA NA
## 51 1.0470810 0.6526718 73.5 13.1 11883 100 99.6
## 52 0.8250377 0.7884131 74.0 12.7 11449 72 21.2
## 53 1.0177665 0.6614268 75.9 14.2 10605 87 77.0
## 54 0.7244224 0.3168449 71.1 13.5 10512 45 43.0
## 55 0.8440367 0.6050633 73.0 12.3 7349 69 76.0
## 56 NA 0.8752711 57.6 9.0 21056 290 112.6
## 57 NA 0.8908686 63.7 4.1 1130 380 65.3
## 58 1.0000000 0.8156749 76.8 16.5 25214 11 16.8
## 59 0.4285714 0.8756999 64.1 8.5 1428 420 78.4
## 60 0.8784119 0.6514286 72.3 13.6 12791 28 30.8
## 61 0.9953488 0.5208333 70.0 15.7 7493 59 42.8
## 62 1.0000000 0.8703125 80.8 17.1 38695 4 9.2
## 63 0.9375000 0.8230519 82.2 16.0 38056 12 5.7
## 64 1.4930748 0.8593272 64.4 12.5 16367 240 103.0
## 65 0.5523810 0.8709288 60.2 8.8 1507 430 115.8
## 66 0.9676375 0.7523302 74.9 13.8 7164 41 46.8
## 67 0.9927835 0.8072289 80.9 16.5 43919 7 3.8
## 68 0.6986090 0.9425770 61.4 11.5 3852 380 58.4
## 69 0.8880597 0.7072000 80.9 17.6 24524 5 11.9
## 70 NA NA 73.4 15.8 10939 23 35.4
## 71 0.9439655 0.5589569 71.8 10.7 6929 140 97.2
## 72 NA 0.8378033 58.8 8.7 1096 650 131.0
## 73 NA 0.8687898 55.2 9.0 1362 560 99.3
## 74 1.2615063 0.5291925 66.4 10.3 6522 250 88.5
## 75 0.6363636 0.8577465 62.8 8.7 1669 380 42.0
## 76 1.0852713 0.5162847 73.1 11.1 3938 120 84.0
## 77 0.9116162 0.7566372 84.0 15.6 53959 NA 3.3
## 78 0.9918946 0.7466667 75.2 15.4 22916 14 12.1
## 79 0.9934498 0.9108527 82.6 19.0 35182 4 11.5
## 80 0.4770318 0.3379224 68.0 11.7 5497 190 32.8
## 81 0.8109756 0.6104513 68.9 13.0 9788 190 48.3
## 82 0.9201183 0.2255435 75.4 15.1 15440 23 31.6
## 83 0.5537849 0.2134670 69.4 10.1 14003 67 68.7
## 84 1.0241730 0.7797357 80.9 18.6 39568 9 8.2
## 85 0.9667812 0.8379161 82.4 16.0 30676 2 7.8
## 86 0.8844720 0.6655462 83.1 16.0 33030 4 4.0
## 87 1.0541311 0.7912553 75.7 12.4 7415 80 70.1
## 88 1.0139860 0.6931818 83.5 15.3 36927 6 5.4
## 89 0.8853503 0.2342342 74.0 13.5 11365 50 26.5
## 90 0.9645749 0.8690629 69.4 15.0 20867 26 29.9
## 91 0.8057325 0.8591160 61.6 11.0 2762 400 93.6
## 92 NA NA 66.0 12.3 2434 130 16.6
## 93 0.8641975 0.6948682 81.9 16.9 33890 27 2.2
## 94 0.9875666 0.5246691 74.4 14.7 83961 14 14.5
## 95 0.9762397 0.7044025 70.6 12.5 3044 75 29.3
## 96 0.6189189 0.9646018 66.2 10.6 4680 NA 65.0
## 97 0.9836957 0.6729323 75.0 14.0 14242 85 68.3
## 98 0.9989899 0.8121302 74.2 15.2 22281 13 13.5
## 99 0.9566787 0.3286319 79.3 13.8 16509 16 12.0
## 100 1.1526316 0.8027211 49.8 11.1 3306 490 89.4
## 101 0.3918575 0.8981481 60.9 9.5 805 640 117.4
## 102 1.3245823 0.3926702 71.6 14.0 14911 15 2.5
## 103 NA NA 80.0 15.0 79851 NA NA
## 104 0.9448568 0.8291233 73.3 16.4 24500 11 10.6
## 105 1.0000000 0.7848297 81.7 13.9 58711 11 8.3
## 106 NA 0.9569061 65.1 10.3 1328 440 122.8
## 107 0.5138889 1.0380368 62.8 10.8 747 510 144.8
## 108 0.9130435 0.5880795 74.7 12.7 22762 29 5.7
## 109 0.8348624 0.7251613 76.8 13.0 12328 31 4.2
## 110 0.5099338 0.6240786 58.0 8.4 1583 550 175.6
## 111 0.8772379 0.5716440 80.6 14.4 27930 9 18.2
## 112 0.3971292 0.3628319 63.1 8.5 3560 320 73.3
## 113 0.8517241 0.5876011 74.4 15.6 17470 73 30.9
## 114 0.9191419 0.5644556 76.8 13.1 16056 49 63.4
## 115 NA NA 69.1 11.7 3432 96 18.6
## 116 0.9689441 0.8506787 71.6 11.9 5223 21 29.3
## 117 1.0142687 0.8167388 69.4 14.6 10729 68 18.7
## 118 0.8891235 0.7504363 76.2 15.2 14558 7 15.2
## 119 0.6854305 0.3496042 74.0 11.6 6850 120 35.8
## 120 0.2258065 1.0326087 55.1 9.3 1123 480 137.8
## 121 1.4967320 0.9137303 65.9 8.6 4608 200 12.1
## 122 0.9680233 0.8587127 64.8 11.3 9418 130 54.9
## 123 0.4633508 0.9173364 69.6 12.4 2311 190 73.7
## 124 0.9690608 0.8286119 81.6 17.9 45435 6 6.2
## 125 0.9968520 0.8401084 81.8 19.2 32689 8 25.3
## 126 1.0287206 0.5902864 74.9 11.5 4457 100 100.8
## 127 0.3076923 0.4459309 61.4 5.4 908 630 204.8
## 128 NA 0.7566719 52.8 9.0 5341 560 119.6
## 129 1.0072389 0.8908297 81.6 17.5 64992 4 7.8
## 130 0.8266200 0.3510896 76.8 13.6 34858 11 10.6
## 131 0.4186551 0.2967431 66.2 7.8 4866 170 27.3
## 132 NA NA 72.7 13.7 13496 NA NA
## 133 0.9074074 0.2319277 72.9 13.0 4699 NA 45.8
## 134 1.0821643 0.5990220 77.6 13.3 18192 85 78.5
## 135 0.5241379 0.9527027 62.6 9.9 2463 220 62.1
## 136 0.8558140 0.6568396 72.9 11.9 7643 110 67.0
## 137 0.8517398 0.8080569 74.6 13.1 11015 89 50.7
## 138 1.0345369 0.6411543 68.2 11.3 7915 120 46.8
## 139 0.9286550 0.7534669 77.4 15.5 23177 3 12.2
## 140 0.9896266 0.8293051 80.9 16.3 25757 8 12.6
## 141 1.1305085 0.5319372 78.2 13.8 123124 6 9.5
## 142 0.9358696 0.7503852 74.7 14.2 18108 33 31.0
## 143 0.9686486 0.7963738 70.1 14.7 22352 24 25.7
## 144 0.9090909 1.0128957 64.2 10.3 1458 320 33.6
## 145 NA NA 73.8 12.9 20805 NA NA
## 146 NA 0.8228346 75.1 12.6 9765 34 56.3
## 147 NA 0.7141026 72.9 13.4 9937 45 54.5
## 148 1.0716667 0.4023973 73.4 12.9 5327 58 28.3
## 149 NA 0.5822622 66.5 11.3 2918 210 65.1
## 150 0.8605974 0.2579821 74.3 16.3 52821 16 10.2
## 151 0.4675325 0.7500000 66.5 7.9 2188 320 94.4
## 152 0.7934783 0.7307061 74.9 14.4 12190 16 16.9
## 153 1.0045045 NA 73.1 13.4 23300 NA 56.3
## 154 0.4608295 0.9521739 50.9 8.6 1780 1100 100.7
## 155 0.9148148 0.7616580 83.0 15.4 76628 6 6.0
## 156 0.9959799 0.7448980 76.3 15.1 25845 7 15.9
## 157 0.9775510 0.8275316 80.4 16.8 27852 7 0.6
## 158 NA 0.6759494 67.9 9.2 1540 130 64.9
## 159 0.9578393 0.7355372 57.4 13.6 12122 140 50.9
## 160 0.5329670 0.3711083 68.4 11.2 5605 183 38.7
## 161 NA NA 55.7 7.6 2332 730 75.3
## 162 0.9138167 0.7978723 82.6 17.3 32045 4 10.6
## 163 0.9515707 0.4600262 74.9 13.7 9779 29 16.9
## 164 0.7015873 0.8537859 58.5 9.6 3363 506 109.7
## 165 0.6648352 0.4118421 63.5 7.0 3809 360 84.0
## 166 0.9469214 0.5886628 71.1 12.7 15617 130 35.2
## 167 0.8423077 0.6131285 49.0 11.3 5542 310 72.0
## 168 0.9908362 0.8880707 82.2 15.8 45636 4 6.5
## 169 0.9834369 0.8251001 83.0 15.8 56431 6 1.9
## 170 0.7283951 0.1856946 69.6 12.3 2728 49 41.6
## 171 1.0427632 0.7639429 69.4 11.2 2517 44 42.8
## 172 0.5894737 0.9767184 65.0 9.2 2411 410 122.7
## 173 0.8750000 0.7967782 74.4 13.5 13323 26 41.0
## 174 0.7230216 0.6385185 75.4 13.4 11780 7 18.3
## 175 NA 0.4842520 68.2 11.7 5363 270 52.2
## 176 0.3995037 0.9913899 59.7 12.2 1228 450 91.5
## 177 0.9909400 0.7171582 72.8 14.7 5069 120 18.1
## 178 0.9802956 0.7019868 70.4 12.3 26090 84 34.8
## 179 0.7114967 0.3540197 74.8 14.6 10404 46 4.6
## 180 0.6500000 0.4152542 75.3 14.5 18677 20 30.9
## 181 NA 0.6098830 65.6 10.8 13066 61 18.0
## 182 0.6835821 0.9570707 58.5 9.8 1613 360 126.6
## 183 0.9562044 0.7952167 71.0 15.1 8178 23 25.7
## 184 1.1944444 0.5054348 77.0 13.3 60868 8 27.6
## 185 0.9989990 0.8107715 80.7 16.2 39267 8 25.8
## 186 1.0031646 0.8171263 79.1 16.5 52947 28 31.0
## 187 1.0815109 0.7239583 77.2 15.5 19283 14 58.3
## 188 NA 0.6362434 68.4 11.5 5567 36 38.8
## 189 NA 0.7687500 71.9 10.6 2803 86 44.8
## 190 1.1141732 0.6452020 74.2 14.2 16159 110 83.2
## 191 0.8342697 0.8880779 75.8 11.9 5092 49 29.0
## 192 0.8333333 0.6558018 71.5 12.2 14301 210 47.4
## 193 0.3220974 0.3518006 63.8 9.2 3519 270 47.0
## 194 0.5863636 0.8539720 60.1 13.5 3734 280 125.4
## 195 0.7854839 0.9275362 57.5 10.9 1615 470 60.3
## rep.parl comp
## 1 27.6 TRUE
## 2 20.7 TRUE
## 3 25.7 TRUE
## 4 50.0 FALSE
## 5 36.8 FALSE
## 6 25.7 FALSE
## 7 14.0 TRUE
## 8 36.8 TRUE
## 9 10.7 TRUE
## 10 30.5 TRUE
## 11 30.3 TRUE
## 12 15.6 TRUE
## 13 16.7 TRUE
## 14 15.0 TRUE
## 15 20.0 TRUE
## 16 19.6 TRUE
## 17 30.1 TRUE
## 18 42.4 TRUE
## 19 13.3 TRUE
## 20 8.4 TRUE
## 21 8.3 TRUE
## 22 51.8 TRUE
## 23 19.3 TRUE
## 24 9.5 TRUE
## 25 9.6 TRUE
## 26 NA FALSE
## 27 20.4 TRUE
## 28 13.3 TRUE
## 29 34.9 TRUE
## 30 20.8 FALSE
## 31 19.0 TRUE
## 32 27.1 TRUE
## 33 28.2 TRUE
## 34 12.5 TRUE
## 35 14.9 TRUE
## 36 15.8 TRUE
## 37 23.6 TRUE
## 38 20.9 TRUE
## 39 3.0 FALSE
## 40 11.5 TRUE
## 41 8.2 TRUE
## 42 33.3 TRUE
## 43 9.2 TRUE
## 44 25.8 TRUE
## 45 48.9 TRUE
## 46 12.5 TRUE
## 47 18.9 TRUE
## 48 38.0 TRUE
## 49 12.7 FALSE
## 50 21.9 FALSE
## 51 19.1 TRUE
## 52 18.7 TRUE
## 53 41.6 TRUE
## 54 2.2 TRUE
## 55 27.4 TRUE
## 56 19.7 FALSE
## 57 22.0 FALSE
## 58 19.8 TRUE
## 59 25.5 TRUE
## 60 19.0 TRUE
## 61 14.0 TRUE
## 62 42.5 TRUE
## 63 25.7 TRUE
## 64 16.2 TRUE
## 65 9.4 TRUE
## 66 11.3 TRUE
## 67 36.9 TRUE
## 68 10.9 TRUE
## 69 21.0 TRUE
## 70 25.0 FALSE
## 71 13.3 TRUE
## 72 21.9 FALSE
## 73 13.7 FALSE
## 74 31.3 TRUE
## 75 3.5 TRUE
## 76 25.8 TRUE
## 77 NA FALSE
## 78 10.1 TRUE
## 79 41.3 TRUE
## 80 12.2 TRUE
## 81 17.1 TRUE
## 82 3.1 TRUE
## 83 26.5 TRUE
## 84 19.9 TRUE
## 85 22.5 TRUE
## 86 30.1 TRUE
## 87 16.7 TRUE
## 88 11.6 TRUE
## 89 11.6 TRUE
## 90 20.1 TRUE
## 91 20.8 TRUE
## 92 8.7 FALSE
## 93 16.3 TRUE
## 94 1.5 TRUE
## 95 23.3 TRUE
## 96 25.0 FALSE
## 97 27.0 TRUE
## 98 18.0 TRUE
## 99 3.1 TRUE
## 100 26.8 TRUE
## 101 10.7 TRUE
## 102 16.0 TRUE
## 103 20.0 FALSE
## 104 23.4 TRUE
## 105 28.3 TRUE
## 106 20.5 FALSE
## 107 16.7 TRUE
## 108 14.2 TRUE
## 109 5.9 TRUE
## 110 9.5 TRUE
## 111 13.0 TRUE
## 112 22.2 TRUE
## 113 11.6 TRUE
## 114 37.1 TRUE
## 115 0.0 FALSE
## 116 20.8 TRUE
## 117 14.9 TRUE
## 118 17.3 TRUE
## 119 11.0 TRUE
## 120 39.6 TRUE
## 121 4.7 TRUE
## 122 37.7 TRUE
## 123 29.5 TRUE
## 124 36.9 TRUE
## 125 31.4 TRUE
## 126 39.1 TRUE
## 127 13.3 TRUE
## 128 6.6 FALSE
## 129 39.6 TRUE
## 130 9.6 TRUE
## 131 19.7 TRUE
## 132 10.3 FALSE
## 133 NA FALSE
## 134 19.3 TRUE
## 135 2.7 TRUE
## 136 16.8 TRUE
## 137 22.3 TRUE
## 138 27.1 TRUE
## 139 22.1 TRUE
## 140 31.3 TRUE
## 141 0.0 TRUE
## 142 12.0 TRUE
## 143 14.5 TRUE
## 144 57.5 TRUE
## 145 6.7 FALSE
## 146 20.7 FALSE
## 147 13.0 FALSE
## 148 6.1 TRUE
## 149 18.2 FALSE
## 150 19.9 TRUE
## 151 42.7 TRUE
## 152 34.0 TRUE
## 153 43.8 FALSE
## 154 12.4 TRUE
## 155 25.3 TRUE
## 156 18.7 TRUE
## 157 27.7 TRUE
## 158 2.0 FALSE
## 159 40.7 TRUE
## 160 17.5 TRUE
## 161 24.3 FALSE
## 162 38.0 TRUE
## 163 5.8 TRUE
## 164 22.5 TRUE
## 165 23.8 TRUE
## 166 11.8 TRUE
## 167 14.7 TRUE
## 168 43.6 TRUE
## 169 28.5 TRUE
## 170 12.4 TRUE
## 171 15.2 TRUE
## 172 36.0 TRUE
## 173 6.1 TRUE
## 174 33.3 TRUE
## 175 38.5 FALSE
## 176 17.6 TRUE
## 177 0.0 TRUE
## 178 24.7 TRUE
## 179 31.3 TRUE
## 180 14.4 TRUE
## 181 25.8 FALSE
## 182 35.0 TRUE
## 183 11.8 TRUE
## 184 17.5 TRUE
## 185 23.5 TRUE
## 186 19.4 TRUE
## 187 11.5 TRUE
## 188 16.4 FALSE
## 189 0.0 FALSE
## 190 17.0 TRUE
## 191 24.3 TRUE
## 192 21.8 TRUE
## 193 0.7 TRUE
## 194 12.7 TRUE
## 195 35.1 TRUE
#Keep only the rows in which every variable is observed
human_ <- human %>% filter(complete.cases(.))
#Check the result: each remaining row is reported as complete or not
complete.cases(human_)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [43] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [57] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [71] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [85] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [99] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [113] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [127] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [141] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [155] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#Remove the observations which relate to regions instead of countries
#Check the size of the data first
dim(human_)
## [1] 162 9
#Print the last 10 rows to inspect what sits at the end of the data
tail(human_, n=10)
## Country edu2.FM lab.FM Life.exp
## 153 United Arab Emirates 1.1944444 0.5054348 77.0
## 154 United Kingdom 0.9989990 0.8107715 80.7
## 155 United States 1.0031646 0.8171263 79.1
## 156 Uruguay 1.0815109 0.7239583 77.2
## 157 Venezuela (Bolivarian Republic of) 1.1141732 0.6452020 74.2
## 158 Viet Nam 0.8342697 0.8880779 75.8
## 159 World 0.8333333 0.6558018 71.5
## 160 Yemen 0.3220974 0.3518006 63.8
## 161 Zambia 0.5863636 0.8539720 60.1
## 162 Zimbabwe 0.7854839 0.9275362 57.5
## Years.exp GNI mater.mor adol.birth rep.parl
## 153 13.3 60868 8 27.6 17.5
## 154 16.2 39267 8 25.8 23.5
## 155 16.5 52947 28 31.0 19.4
## 156 15.5 19283 14 58.3 11.5
## 157 14.2 16159 110 83.2 17.0
## 158 11.9 5092 49 29.0 24.3
## 159 12.2 14301 210 47.4 21.8
## 160 9.2 3519 270 47.0 0.7
## 161 13.5 3734 280 125.4 12.7
## 162 10.9 1615 470 60.3 35.1
#Drop the regional aggregates by name instead of by position.
#The rows are in alphabetical order (the tail() output shows "World" at row
#159, followed by Yemen, Zambia and Zimbabwe), so cutting off the last 7 rows
#would delete real countries and leave regions such as "Arab States" in the
#data.
#NOTE(review): the spellings below are the usual UNDP region labels; confirm
#them against unique(human_$Country).
regions <- c(
  "Arab States",
  "East Asia and the Pacific",
  "Europe and Central Asia",
  "Latin America and the Caribbean",
  "South Asia",
  "Sub-Saharan Africa",
  "World"
)
human_ <- human_[!(human_$Country %in% regions), ]
dim(human_)
## [1] 155 9
# Use the country names as row labels, then drop the Country column so that
# only the numeric variables remain.
row.names(human_) <- human_[["Country"]]
human_ <- human_ %>% dplyr::select(-Country)
# The data should now have one column fewer than before.
dim(human_)
## [1] 155 8
str(human_)
## 'data.frame': 155 obs. of 8 variables:
## $ edu2.FM : num 0.198 0.931 0.861 0.729 0.977 ...
## $ lab.FM : num 0.199 0.685 0.211 0.308 0.633 ...
## $ Life.exp : num 60.4 77.8 74.8 70.6 76.3 74.7 82.4 81.4 70.8 75.4 ...
## $ Years.exp : num 9.3 11.8 14 12 17.9 12.3 20.2 15.7 11.9 12.6 ...
## $ GNI : num 1885 9943 13054 15722 22050 ...
## $ mater.mor : int 400 21 89 155 69 29 6 4 26 37 ...
## $ adol.birth: num 86.8 15.3 10 45.4 54.4 27.1 12.1 4.1 40 28.5 ...
## $ rep.parl : num 27.6 20.7 25.7 14 36.8 10.7 30.5 30.3 15.6 16.7 ...
# Save the wrangled data; row names (the countries) are written as the first column.
write.csv(human_, file = "~/IODS-project/data/human_")
# Graphical overview and summaries of the variables
library(GGally)
library(ggplot2)
library(corrplot)
# Scatterplot matrix with distributions and pairwise correlations.
ggpairs(human_)
# Correlation matrix drawn as a correlation plot.
corrplot(cor(human_))
#The strongest correlation is between the maternal mortality ratio and life expectancy at birth, and the correlation is negative.
# PCA on the raw (unstandardized) variables.
pca_human <- prcomp(human_)
# Biplot of the first two principal components: grey observation labels,
# pink variable arrows.
biplot(
  pca_human,
  choices = 1:2,
  col = c("grey40", "deeppink2"),
  cex = c(0.8, 1)
)
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(0, 0, y[, 1L] * 0.8, y[, 2L] * 0.8, col = col[2L], length
## = arrow.len): zero-length arrow is of indeterminate angle and so skipped
# Standardize the variables (center and scale each column), then repeat the
# PCA and the biplot on the standardized data.
human_std <- scale(human_)
pca_human2 <- prcomp(human_std)
biplot(
  pca_human2,
  choices = 1:2,
  col = c("grey40", "deeppink2"),
  cex = c(0.8, 1)
)
# Packages for plotting, multiple correspondence analysis and data wrangling.
library(ggplot2)
library(FactoMineR)
library(dplyr)
library(tidyr)
# Load the tea data set (presumably the one provided by FactoMineR) and
# inspect its structure.
data(tea)
str(tea)
## 'data.frame': 300 obs. of 36 variables:
## $ breakfast : Factor w/ 2 levels "breakfast","Not.breakfast": 1 1 2 2 1 2 1 2 1 1 ...
## $ tea.time : Factor w/ 2 levels "Not.tea time",..: 1 1 2 1 1 1 2 2 2 1 ...
## $ evening : Factor w/ 2 levels "evening","Not.evening": 2 2 1 2 1 2 2 1 2 1 ...
## $ lunch : Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
## $ dinner : Factor w/ 2 levels "dinner","Not.dinner": 2 2 1 1 2 1 2 2 2 2 ...
## $ always : Factor w/ 2 levels "always","Not.always": 2 2 2 2 1 2 2 2 2 2 ...
## $ home : Factor w/ 2 levels "home","Not.home": 1 1 1 1 1 1 1 1 1 1 ...
## $ work : Factor w/ 2 levels "Not.work","work": 1 1 2 1 1 1 1 1 1 1 ...
## $ tearoom : Factor w/ 2 levels "Not.tearoom",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ friends : Factor w/ 2 levels "friends","Not.friends": 2 2 1 2 2 2 1 2 2 2 ...
## $ resto : Factor w/ 2 levels "Not.resto","resto": 1 1 2 1 1 1 1 1 1 1 ...
## $ pub : Factor w/ 2 levels "Not.pub","pub": 1 1 1 1 1 1 1 1 1 1 ...
## $ Tea : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
## $ How : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
## $ sugar : Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
## $ how : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ where : Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ price : Factor w/ 6 levels "p_branded","p_cheap",..: 4 6 6 6 6 3 6 6 5 5 ...
## $ age : int 39 45 47 23 48 21 37 36 40 37 ...
## $ sex : Factor w/ 2 levels "F","M": 2 1 1 2 2 2 2 1 2 2 ...
## $ SPC : Factor w/ 7 levels "employee","middle",..: 2 2 4 6 1 6 5 2 5 5 ...
## $ Sport : Factor w/ 2 levels "Not.sportsman",..: 2 2 2 1 2 2 2 2 2 1 ...
## $ age_Q : Factor w/ 5 levels "15-24","25-34",..: 3 4 4 1 4 1 3 3 3 3 ...
## $ frequency : Factor w/ 4 levels "1/day","1 to 2/week",..: 1 1 3 1 3 1 4 2 3 3 ...
## $ escape.exoticism: Factor w/ 2 levels "escape-exoticism",..: 2 1 2 1 1 2 2 2 2 2 ...
## $ spirituality : Factor w/ 2 levels "Not.spirituality",..: 1 1 1 2 2 1 1 1 1 1 ...
## $ healthy : Factor w/ 2 levels "healthy","Not.healthy": 1 1 1 1 2 1 1 1 2 1 ...
## $ diuretic : Factor w/ 2 levels "diuretic","Not.diuretic": 2 1 1 2 1 2 2 2 2 1 ...
## $ friendliness : Factor w/ 2 levels "friendliness",..: 2 2 1 2 1 2 2 1 2 1 ...
## $ iron.absorption : Factor w/ 2 levels "iron absorption",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ feminine : Factor w/ 2 levels "feminine","Not.feminine": 2 2 2 2 2 2 2 1 2 2 ...
## $ sophisticated : Factor w/ 2 levels "Not.sophisticated",..: 1 1 1 2 1 1 1 2 2 1 ...
## $ slimming : Factor w/ 2 levels "No.slimming",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ exciting : Factor w/ 2 levels "exciting","No.exciting": 2 1 2 2 2 2 2 2 2 2 ...
## $ relaxing : Factor w/ 2 levels "No.relaxing",..: 1 1 2 2 2 2 2 2 2 2 ...
## $ effect.on.health: Factor w/ 2 levels "effect on health",..: 2 2 2 2 2 2 2 2 2 2 ...
dim(tea)
## [1] 300 36
# Keep only six of the columns for the analysis.
keep_columns <- c("Tea", "How", "how", "sugar", "where", "lunch")
tea_time <- tea %>% dplyr::select(one_of(keep_columns))
# Level counts for each of the kept variables.
summary(tea_time)
## Tea How how sugar
## black : 74 alone:195 tea bag :170 No.sugar:155
## Earl Grey:193 lemon: 33 tea bag+unpackaged: 94 sugar :145
## green : 33 milk : 63 unpackaged : 36
## other: 9
## where lunch
## chain store :192 lunch : 44
## chain store+tea shop: 78 Not.lunch:256
## tea shop : 30
##
# Check the structure of the reduced data (the six selected columns).
str(tea_time)
## 'data.frame': 300 obs. of 6 variables:
## $ Tea : Factor w/ 3 levels "black","Earl Grey",..: 1 1 2 2 2 2 2 1 2 1 ...
## $ How : Factor w/ 4 levels "alone","lemon",..: 1 3 1 1 1 1 1 3 3 1 ...
## $ how : Factor w/ 3 levels "tea bag","tea bag+unpackaged",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ sugar: Factor w/ 2 levels "No.sugar","sugar": 2 1 1 2 1 1 1 1 1 1 ...
## $ where: Factor w/ 3 levels "chain store",..: 1 1 1 1 1 1 1 1 2 2 ...
## $ lunch: Factor w/ 2 levels "lunch","Not.lunch": 2 2 2 2 2 2 2 2 2 2 ...
#Visualizing the dataset and multiple correspondence analysis
# Reshape to long key/value form and draw one bar chart per variable, with
# free scales per panel and rotated x-axis labels so the level names fit.
gather(tea_time) %>%
  ggplot(aes(value)) +
  geom_bar() +
  facet_wrap("key", scales = "free") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 8))
## Warning: attributes are not identical across measure variables;
## they will be dropped
# Run the multiple correspondence analysis without drawing the default graphs.
maca <- MCA(tea_time, graph = FALSE)
# Plot the variable categories only (individuals hidden), coloured by variable.
plot(maca, invisible = "ind", habillage = "quali")